"""
#
# Flow stability for dynamic community detection https://arxiv.org/abs/2101.06131v2
#
# Copyright (C) 2021 Alexandre Bovet <alexandre.bovet@maths.ox.ac.uk>
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


This script builds and saves the dataframe `df_author_journal`, which records
the journals in which each author has published.

The result is saved in `df_author_journal.csv`.

"""

import pandas as pd
# NOTE(review): `nproc` is not referenced anywhere in the visible part of this
# file -- confirm whether it is still needed.
nproc = 6
from collections import Counter
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects; it is used further down
# for the row-wise and group-wise computations.
tqdm.pandas()

# NOTE(review): this unconditional raise aborts the script when it is executed
# from top to bottom. Presumably the `#%%` cells below are meant to be run one
# at a time in an interactive session -- confirm before removing.
raise Exception
#%% 

# Edge table; its `doi`, `n1` and `n2` columns are used below.
df_edges = pd.read_csv('../data/aps/all_journals_disamb_edges.csv.gz',
                       index_col=0)

#%%

# Stack both edge endpoints (`n1` first, then `n2`) under a single `author`
# column, repeating the `doi` column so that each row is one (doi, author) pair.
df_authors_dois = pd.DataFrame(data={
    'doi': df_edges['doi'].tolist() * 2,
    'author': [*df_edges['n1'].tolist(), *df_edges['n2'].tolist()],
})
# The same pair can come from several edges: keep only its first occurrence.
df_authors_dois.drop_duplicates(subset=['doi', 'author'], inplace=True)

# The journal is the second '/'-separated part of the DOI, cut at its first '.'.
df_authors_dois['journal'] = df_authors_dois['doi'].progress_apply(
    lambda doi: doi.split('/')[1].split('.')[0])

#%%

def find_journal(df):
    """
    Summarise the journals in which a single author has published.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to one author. It must provide an `author` column
        (its first value labels the output row) and a `journal` column.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by the author, with the columns

        - `top_journal`: the most frequent journal ('' if there is none),
        - `top_non_prl`: the most frequent journal that is not one of
          'PhysRevLett', 'PhysRevSeriesI', 'PhysRev' or 'RevModPhys'
          (None if there is no such journal),
        - `journal_counts`: every journal as a 'journal:count,...' string,
          ordered from the most to the least frequent.

        Journal values that are not strings are ignored in all three columns.

    Raises
    ------
    Exception
        Any error is re-raised unchanged after the offending group has been
        printed.
    """
    excluded = ('PhysRevLett', 'PhysRevSeriesI', 'PhysRev', 'RevModPhys')

    try:
        # Rank the journals once; the same filtered ranking feeds every
        # output column so that they cannot disagree with each other.
        ranked = [(journal, count)
                  for journal, count in Counter(df.journal.tolist()).most_common()
                  if isinstance(journal, str)]

        journal_counts = ','.join(f'{journal}:{count}'
                                  for journal, count in ranked)

        top_journal = ranked[0][0] if ranked else ''

        # First journal of the ranking outside the excluded set, if any.
        top_non_prl = next((journal for journal, _ in ranked
                            if journal not in excluded), None)

        return pd.DataFrame(data={'top_journal': top_journal,
                                  'top_non_prl': top_non_prl,
                                  'journal_counts': journal_counts},
                            index=[df.author.iloc[0]])

    except Exception:
        # Show the group that failed before propagating the error.
        print(df)
        raise

# One summary row per author, computed group by group with a progress bar.
df_auth_doi = df_authors_dois.groupby(by='author').progress_apply(find_journal)

#%%
# Drop the second index level so that only one level is left before saving.
df_auth_doi.index = df_auth_doi.index.droplevel(1)

df_auth_doi.to_csv(path_or_buf='../data/aps/df_author_journal.csv', sep=';')
        
        
    
